Load libraries

library(phyloseq)
library(ggplot2)
library(dplyr)
library(scales)
library(grid)
library(reshape2)
library(gridExtra)
library(vegan)
library(cowplot)
library(gtable)
library(pander)
library(tidyr)

Global

## objects and functions that will be useful throughout this analysis

# Set the ggplot theme
theme_set(theme_bw())

# Color palette for stations
station_colors = c("red", "#ffa500", "#0080ff")

# Function to order date levels correctly
order_dates_aug11 <- function(df) {
  df$Date <- factor(df$Date, 
    levels = c("6/16","6/30","7/8","7/14","7/21",
      "7/29","8/4","8/11","8/18","8/25","9/2","9/8","9/15",
      "9/23","9/29","10/6","10/15","10/20","10/27"))
  return(df)
}

order_dates <- function(df) {
 df$Date <- factor(df$Date,
   levels = c("6/16","6/30","7/8","7/14","7/21",
     "7/29","8/4", "8/18","8/25","9/2","9/8","9/15",
     "9/23","9/29","10/6","10/15","10/20","10/27"))
 return(df)
}

named_list <- function(...){
    names <- as.list(substitute(list(...)))[-1L]
    result <- list(...)
    names(result) <- names
    result
}

# Source some useful functions for data normalizatio
source("~/git_repos/MicrobeMiseq/R/miseqR.R")

Load Data

load("~/git_repos/chabs/miseq_may2015/analysis/data/erie-data.RData")

# Inspect erie phyloseq object
erie
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7192 taxa and 53 samples ]
## sample_data() Sample Data:       [ 53 samples by 32 sample variables ]
## tax_table()   Taxonomy Table:    [ 7192 taxa by 9 taxonomic ranks ]

Normalization

Scale counts to an even depth by dividing by total reads and multiplying by the minimum library size

## Scale reads in OTU table to even depth 
depth = 15000

erie_scale <- 
  erie %>%
    scale_reads(n = depth, round = "round") 

Figure 1: Bloom temporal dynamics

Create figure 1, showing pigment concentrations, toxin concentration, and proportion of cyanobacterial reads over time and stations.

# Import metadata file with nutrients, pigments and toxin
nutrient <- read.csv("~/git_repos/chabs/miseq_may2015/analysis/data/nutrient_cleaned.csv")

# Format nutrient data
nutrient_sub <- 
  nutrient %>%
    filter(!(Date %in% c("5/27", "6/10", "11/3"))) %>%
    order_dates_aug11() 

# Calculate relative abundance of Cyanobacteria at each date
cyano_abundance <- 
  erie_scale %>%
    tax_glom(taxrank = "Phylum") %>%                        # conglomerate OTUs to phylum level
    transform_sample_counts(function(x) {x/sum(x)} ) %>%    # transform to relative abundance
    subset_taxa(Phylum == "Cyanobacteria") %>%              # Subset to just Cyanobacteria
    psmelt() %>%                                            # melt phyloseq object
    rename(Cyanobacteria = Abundance) %>%
    select(Cyanobacteria, Date, Station)

# Merge cyanobacteria data with nutrient df
bloom_df <- 
  nutrient_sub %>%
    left_join(cyano_abundance, by = c("Station", "Date")) %>%
    mutate(Phycocyanin = ifelse(Phycocyanin > 80, 80, Phycocyanin)) %>%   # lower extreme values to plot better
    select(Station, Date, Phycocyanin, Chla, ParMC, Cyanobacteria) %>%
    melt(id.vars = c("Station", "Date")) %>%
    order_dates_aug11()

# Make a faceted ggplot of the four bloom variables over time and grouped by station
bloom_plots <- ggplot(bloom_df, 
  aes(x = Date, y = value, group = Station, color = Station, shape = Station)
) +
  facet_grid(variable~., scales = "free_y") +
  geom_point(size = 1.3) +
  geom_line(size = 1) + 
  ylab("") +
  scale_x_discrete(
    breaks = c("7/8", "8/4", "9/2", "10/6"),
    labels = c("Jul", "Aug", "Sep", "Oct"),
    drop = FALSE
  ) +
  scale_color_manual(values = station_colors) + 
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 11),
    axis.title.x = element_blank()
  )

# function to extract a legend from a ggplot object
grab_legend <- function(a_ggplot) {
    tmp <- ggplot_gtable(ggplot_build(a_ggplot))
    leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
    legend <- tmp$grobs[[leg]]
    legend
}

station_legend <- grab_legend(bloom_plots) 

bloom_plots

Figure 1 Statistics

par(mfrow = c(1,3))
hist(nutrient$Chla)
hist(nutrient$Phycocyanin)
hist(nutrient$ParMC)

These distributions are very non-normal. Let’s try log-scaling

par(mfrow = c(1,3))
hist(log(nutrient$Chla), xlab = "log Chla", main = "")
hist(log(nutrient$Phycocyanin), xlab = "log Phycocyanin", main = "")
hist(log(nutrient$ParMC), xlab = "log ParMC", main = "")

These look better, so we will use these transformed variables for our correlation tests. Since phycocyanin and parmc both have zeroes, we will add a constant to all values. We selected this constant to be small and to minimally impact the distribution shape

par(mfrow = c(1,2))

logChla <- log(nutrient$Chla)
logPhyco <- log(nutrient$Phycocyanin + 0.1) 
logParMC <- log(nutrient$ParMC + 0.1) 

hist(logPhyco, main = "", xlab = "log Phyco + 0.1")
hist(logParMC, main = "", xlab = "log ParMC + 0.1")

plot(logChla, logPhyco, xlab = "log Chla", ylab = "log Phycocyanin") 

# Calculate correlation between Chla and phycocyanin for all sites
cor.test(
  x = logChla, 
  y = logPhyco, 
  alternative = "two.sided", 
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  logChla and logPhyco
## t = 10.175, df = 55, p-value = 2.985e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6936187 0.8828031
## sample estimates:
##       cor 
## 0.8081294

It looks like there is a pretty close correlation between chl a and phycocyanin measurements.

par(mfrow = c(1,2))
plot(logChla, logParMC)
plot(logPhyco, logParMC)

cor.test(
  x = logChla, 
  y = logParMC, 
  alternative = "two.sided", 
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  logChla and logParMC
## t = 6.2612, df = 55, p-value = 6.071e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4622297 0.7753393
## sample estimates:
##       cor 
## 0.6451002
cor.test(
  x = logPhyco, 
  y = logParMC, 
  alternative = "two.sided", 
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  logPhyco and logParMC
## t = 11.949, df = 55, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7565981 0.9089839
## sample estimates:
##       cor 
## 0.8496596

The statistical tests show that there are significant correlations between pigments and toxin, but the plots show that there are several dates in which the toxin levels are 0 but pigments are high, indicating that pigments are not always predictive of toxicity.

# add log scaled measurements to nutrient data frame

Figure 2: Cyano OTUs

In this figure we will display in part A a barplot with the relative abundance of cyanobacteria genera over time and site. In part B we will break up these groups into OTUs and show lineplots for each non-rare OTU over time (mean rel abundance > 0.0001) and site.

## Select only cyano OTUs that have mean relative abundace > 0.0001
n = 15000
thresh = 0.0005

# Calculate mean relative abundance for each OTU
tax_mean <- taxa_sums(erie_scale)/nsamples(erie_scale)

# Prune low abundance taxa using thresh as mean relative abundance
erie_prune_0001 <- 
  erie_scale %>%
    prune_taxa(tax_mean > thresh*n, .)


# Create a melted data frame of selected cyanobacteria OTUs
cyano_otus <- 
  erie_prune_0001 %>%
    transform_sample_counts(function(x) {x/sum(x)}) %>%
    subset_taxa(Class == "Cyanobacteria") %>%
    psmelt() 
################# Plot A #######################

cyano_genus <-
  cyano_otus %>%
    group_by(Genus) %>%
    arrange(Genus) %>%
    order_dates()


plot2a <- ggplot(cyano_genus, aes(x = Date, y = Abundance, fill = Genus)) +
  facet_grid(~Station) +
  geom_bar(stat = "identity") +
  ylab("rel. abundance") + 
  scale_x_discrete(
      breaks = c("7/8", "8/4", "9/2", "10/6"),
      labels = c("Jul", "Aug", "Sep", "Oct"),
      drop = FALSE
    ) +
  scale_fill_brewer(palette = "Paired") + 
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 9),
    plot.title = element_text(size = 10, face = "bold"),
    strip.background = element_blank()
  ) 

genus_legend <- grab_legend(plot2a)

plot2a <- plot2a + theme(legend.position = "none")
# Function to make a ggplot lineplot of an OTU's relative abundance over time
#
# Args:
#   df: a melted data frame generated by calling psmelt() on a phyloseq object. 
#       Contains an "Abundance" column for the OTU's abundance 
#   otu: the OTU to generate a lineplot for
#   taxrank: the taxonomic rank to appear in the plot title (e.g. "Genus")
# Returns:
#   a ggplot lineplot
plot_otus <- function(df, otu, taxrank) {
  ggplot(df, 
    aes(x = Date, y = Abundance, group = Station, color = Station, shape = Station)) +
    geom_point(size = 2) +
    geom_line(size = 0.7) +
    ggtitle(paste(df[1, taxrank], otu)) +
    ylab("rel. abund") + 
    scale_color_manual(values = station_colors) +
    scale_x_discrete(
      breaks = c("7/8", "8/4", "9/2", "10/6"),
      labels = c("Jul", "Aug", "Sep", "Oct"),
      drop = FALSE
    ) +
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_text(size = 9),
      legend.position = "none",
      plot.title = element_text(size = 10, face = "bold")
    ) 
}



##################### Plot B #############################

cyano_otu_names <- as.list(levels(cyano_otus$Species))
names(cyano_otu_names) <- levels(cyano_otus$Species)


# Generate a lineplot for each cyanobacteria with mean relative abundance > 0.0001
cyano_otu_plots <- lapply(cyano_otu_names, 
  function(otu) {
    df_otu <- filter(cyano_otus, OTU == otu)
    plot <- plot_otus(df = df_otu, otu = otu, taxrank = "Genus") 
    return(plot)
  }
)

plot2b <- arrangeGrob(
  cyano_otu_plots$Otu00007, cyano_otu_plots$Otu00177, cyano_otu_plots$Otu00005,
  cyano_otu_plots$Otu00044, cyano_otu_plots$Otu00304, cyano_otu_plots$Otu00037, 
  cyano_otu_plots$Otu00147, cyano_otu_plots$Otu00193, cyano_otu_plots$Otu00049,
  ncol = 3, nrow = 3
)
##################### Compile plots #####################################

ggdraw() +
  draw_plot(plot2a, x = 0.02, y = 0.72, width = .8, height = 0.28) +
  draw_plot(genus_legend, x = 0.84, y = .79, width = .16, height = .18) + 
  draw_plot(plot2b, x = 0.02, y = 0, width = .8, height = 0.67) +
  draw_plot(station_legend, x = 0.82, y = .5, width = .18, height = .18) + 
  draw_plot_label(c("A", "B"), c(0, 0), c(1, 0.5), size = 14)

Figure 3: Alpha diversity

# My own subsetting function, similar to phyloseq::subset_taxa, except taxa can 
# be passed as arguments within functions without weird environment errors
#
# Args:
#   physeq: a phyloseq object
#   taxrank: taxonomic rank to filter on
#   taxa: a vector of taxa groups to filter on
#
# Returns: 
#   a phyloseq object subsetted to the x taxa in taxrank
my_subset_taxa <- function(physeq, taxrank, taxa) {
  physeq_tax_sub <- tax_table(physeq)[tax_table(physeq)[ , taxrank] %in% taxa, ]
  tax_table(physeq) <- physeq_tax_sub
  return(physeq)
}

Here we calculate the inverse simpson index for each bacterial group as well as all NcBacteria

mytaxa <- c(
  "Actinobacteria", "Alphaproteobacteria",
  "Betaproteobacteria", "Bacteroidetes", 
  "Gammaproteobacteria", "Deltaproteobacteria", 
  "Verrucomicrobia"
)
names(mytaxa) <- mytaxa

# Taxonomic ranks of mytaxa
mytaxa_taxrank <- c(
  "Phylum", "Class", "Class", 
  "Phylum", "Class", "Class", "Phylum"
)
names(mytaxa_taxrank) <- mytaxa

# For each bacterial group, subset the phyloseq object and calculate the inverse simpson
invsimp_df <- Map(
  function(tax, rank) {
    # Subset to the taxonomic group
    erie_sub <- my_subset_taxa(physeq = erie, taxrank = rank, taxa = tax)
    # Calculate inverse simpson
    invsimp <- estimate_richness(erie_sub, measures = "InvSimpson")
  },
  mytaxa, mytaxa_taxrank
)


# Calculate inv simpson of all NcBacteria
ncbacteria <- subset_taxa(erie, Class != "Cyanobacteria")
invsimp_df$NcBacteria <- estimate_richness(ncbacteria, measures = "InvSimpson")
# Merge sample metadata with these estimates
bloom_dat <- data.frame(sample_data(erie)) %>%
  select(SampleID, Chla, Station, Date, Phycocyanin, ParMC, pH, TP) %>%
  order_dates()

nutrient <- order_dates(nutrient)
ggplot(nutrient, aes(x = Date, y = TP, group = Station, color = Station)) +
  geom_point() +
  geom_line()

plot(log(bloom_dat$TP), log(bloom_dat$Chla))

plot(log(bloom_dat$TP), log(bloom_dat$Phycocyanin))

plot(log(bloom_dat$TP), bloom_dat$pH)

plot(log(bloom_dat$Chla), bloom_dat$pH)

# Create a df with a "Diversity" column that includes richness and inv. simpson,
# and log-chl a values from erie sample_data
alphadiv <- as.data.frame(invsimp_df)
names(alphadiv) <- names(invsimp_df)
alphadiv$SampleID <- rownames(alphadiv)

alphadiv_df <- alphadiv %>% 
  melt(id.vars = "SampleID", variable.name = "Group", value.name = "InvSimp") %>%
  left_join(sample_data(erie), by = c("SampleID" = "SampleID"))

Explore relationships between alpha-diversity and pH

alphadiv_ph_plots <- lapply(levels(alphadiv_df$Group), function(x) {
  df <- filter(alphadiv_df, Group == x)
  ggplot(df, aes(x = pH, y = InvSimp)) +
    geom_point() +
    geom_smooth(method = "lm") +
    ggtitle(x)
})

do.call(grid.arrange, alphadiv_ph_plots)

# Betaproteo
betas <- filter(alphadiv_df, Group == "Betaproteobacteria")
cor.test(betas$InvSimp, betas$pH)
## 
##  Pearson's product-moment correlation
## 
## data:  betas$InvSimp and betas$pH
## t = 4.3644, df = 48, p-value = 6.747e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2990222 0.7065324
## sample estimates:
##       cor 
## 0.5330066
# Bacteroidetes
bacteroidetes <- filter(alphadiv_df, Group == "Bacteroidetes")
cor.test(bacteroidetes$InvSimp, bacteroidetes$pH)
## 
##  Pearson's product-moment correlation
## 
## data:  bacteroidetes$InvSimp and bacteroidetes$pH
## t = 3.3108, df = 48, p-value = 0.001772
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1736684 0.6334917
## sample estimates:
##       cor 
## 0.4311732

Relationship between alpha-diversity and Chla

alphadiv_chla_plots <- lapply(levels(alphadiv_df$Group), function(x) {
  df <- filter(alphadiv_df, Group == x)
  ggplot(df, aes(x = log(Chla), y = InvSimp)) +
    geom_point() +
    geom_smooth(method = "lm") +
    ggtitle(x)
})

do.call(grid.arrange, alphadiv_chla_plots)

# Betaproteo
betas <- filter(alphadiv_df, Group == "Betaproteobacteria")
cor.test(betas$InvSimp, log(betas$Chla))
## 
##  Pearson's product-moment correlation
## 
## data:  betas$InvSimp and log(betas$Chla)
## t = 6.3848, df = 51, p-value = 5.096e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4832813 0.7937984
## sample estimates:
##       cor 
## 0.6665102
# Bacteroidetes
bacteroidetes <- filter(alphadiv_df, Group == "Bacteroidetes")
cor.test(bacteroidetes$InvSimp,log(bacteroidetes$Chla))
## 
##  Pearson's product-moment correlation
## 
## data:  bacteroidetes$InvSimp and log(bacteroidetes$Chla)
## t = 4.9682, df = 51, p-value = 7.985e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3557001 0.7288717
## sample estimates:
##       cor 
## 0.5710875

Relationship between alpha-diversity and Phycocyanin

alphadiv_phyco_plots <- lapply(levels(alphadiv_df$Group), function(x) {
  df <- filter(alphadiv_df, Group == x)
  ggplot(df, aes(x = log(Phycocyanin + 0.1), y = InvSimp)) +
    geom_point() +
    geom_smooth(method = "lm") +
    ggtitle(x)
})

do.call(grid.arrange, alphadiv_phyco_plots)

# Betaproteo
betas <- filter(alphadiv_df, Group == "Betaproteobacteria")
cor.test(betas$InvSimp, log(betas$Phycocyanin + 0.1))
## 
##  Pearson's product-moment correlation
## 
## data:  betas$InvSimp and log(betas$Phycocyanin + 0.1)
## t = 4.6294, df = 51, p-value = 2.556e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3208331 0.7098776
## sample estimates:
##       cor 
## 0.5439555
# Bacteroidetes
bacteroidetes <- filter(alphadiv_df, Group == "Bacteroidetes")
cor.test(bacteroidetes$InvSimp, log(betas$Phycocyanin + 0.1))
## 
##  Pearson's product-moment correlation
## 
## data:  bacteroidetes$InvSimp and log(betas$Phycocyanin + 0.1)
## t = 4.6865, df = 51, p-value = 2.105e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3268212 0.7131804
## sample estimates:
##       cor 
## 0.5486486

Relationship between alpha-diversity and TP

alphadiv_TP_plots <- lapply(levels(alphadiv_df$Group), function(x) {
  df <- filter(alphadiv_df, Group == x)
  ggplot(df, aes(x = log(TP), y = InvSimp)) +
    geom_point() +
    geom_smooth(method = "lm") +
    ggtitle(x)
})

do.call(grid.arrange, alphadiv_TP_plots)

# Betaproteo
betas <- filter(alphadiv_df, Group == "Betaproteobacteria")
cor.test(betas$InvSimp, log(betas$TP))
## 
##  Pearson's product-moment correlation
## 
## data:  betas$InvSimp and log(betas$TP)
## t = 5.7827, df = 51, p-value = 4.482e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4325907 0.7688222
## sample estimates:
##       cor 
## 0.6293024
# Bacteroidetes
bacteroidetes <- filter(alphadiv_df, Group == "Bacteroidetes")
cor.test(bacteroidetes$InvSimp, log(betas$TP))
## 
##  Pearson's product-moment correlation
## 
## data:  bacteroidetes$InvSimp and log(betas$TP)
## t = 5.2149, df = 51, p-value = 3.373e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3800171 0.7417871
## sample estimates:
##       cor 
## 0.5897355

Relationship between alpha-diversity and Temp

alphadiv_temp_plots <- lapply(levels(alphadiv_df$Group), function(x) {
  df <- filter(alphadiv_df, Group == x)
  ggplot(df, aes(x = Temp, y = InvSimp)) +
    geom_point() +
    geom_smooth(method = "lm") +
    ggtitle(x)
})

do.call(grid.arrange, alphadiv_temp_plots)

# Verrucomicrobia
verr <- filter(alphadiv_df, Group == "Verrucomicrobia")
cor.test(verr$InvSimp, verr$Temp)
## 
##  Pearson's product-moment correlation
## 
## data:  verr$InvSimp and verr$Temp
## t = -7.6644, df = 51, p-value = 4.879e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8364992 -0.5750517
## sample estimates:
##        cor 
## -0.7316266
# Gamma
gamm <- filter(alphadiv_df, Group == "Gammaproteobacteria")
cor.test(gamm$InvSimp, gamm$Temp)
## 
##  Pearson's product-moment correlation
## 
## data:  gamm$InvSimp and gamm$Temp
## t = -6.0812, df = 51, p-value = 1.53e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7816459 -0.4583543
## sample estimates:
##        cor 
## -0.6483267

Seasonal alpha diversity trends

alphadiv_ncbacteria <- filter(alphadiv_df, Group == "NcBacteria") %>%
  order_dates()

ggplot(alphadiv_ncbacteria, aes(x = Date, y = InvSimp, group = Station, color = Station, shape = Station)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = station_colors) 

alphadiv_ncbacteria <- filter(alphadiv_df, Group == "Bacteroidetes") %>%
  order_dates()

ggplot(alphadiv_ncbacteria, aes(x = Date, y = InvSimp, group = Station, color = Station, shape = Station)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = station_colors) 

alphadiv_ncbacteria <- filter(alphadiv_df, Group == "Betaproteobacteria") %>%
  order_dates()

ggplot(alphadiv_ncbacteria, aes(x = Date, y = InvSimp, group = Station, color = Station, shape = Station)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = station_colors) 

alphadiv_ncbacteria <- filter(alphadiv_df, Group == "NcBacteria") %>%
  order_dates()

plot3a <- ggplot(alphadiv_ncbacteria, aes(x = Date, y = InvSimp, group = Station, color = Station, shape = Station)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = station_colors) +
  xlab("") +
  scale_x_discrete(
      breaks = c("7/8", "8/4", "9/2", "10/6"),
      labels = c("Jul", "Aug", "Sep", "Oct"),
      drop = FALSE
  ) 

ggdraw() +
  draw_plot(plot3a, x = 0.02,  y = .5, width = .98, height = .49) +
  draw_plot(alphadiv_chla_plots[[8]], x = 0.02,  y = 0, width = .3, height = .48) +  
  draw_plot(alphadiv_chla_plots[[3]], x = 0.34,  y = 0, width = .3, height = .48) +
  draw_plot(alphadiv_chla_plots[[4]], x = 0.66,  y = 0, width = .3, height = .48) +
  draw_plot_label(c("A", "B"), c(0, 0), c(1, 0.5), size = 14)

Figure 4: PCoA analyses

PCoA ordination plots

# Subset to cyanobacteria and scale internally
cyanos <- 
  erie %>%
    subset_taxa(Class == "Cyanobacteria") %>%
    scale_reads(round = "round")


# Subset to non-cyanobacteria and scale internally
non_cyanos <- 
  erie %>%
    subset_taxa(Class != "Cyanobacteria") %>%
    scale_reads(round = "round")

# Make a list of phyloseq objects for the full community, cyanos, and non-cyanos
physeq_subsets <- list(erie_scale, cyanos, non_cyanos)
names(physeq_subsets) <- c("full", "Cyanobacteria", "NcBacteria")


# Generate pcoa scores for each subset
pcoas <- lapply(physeq_subsets, 
  function(x) {
    ordinate(
      physeq = x,
      method = "PCoA",
      distance = "bray"
    )
  }
)


# Generate a df to plot pcoa for each subset
pcoa_dfs <- lapply(pcoas,
  function(x, names) {
    p <- plot_ordination(
      physeq = erie_scale,
      axes = 1:3,
      ordination = x,
      justDF = TRUE
    )
    p$Month <- factor(p$Month, 
      levels = c("June", "July", "August", "September", "October"))
    p <- p %>% 
      rename(PC1 = Axis.1, PC2 = Axis.2, PC3 = Axis.3) %>%
      order_dates()
    return(p)
  }
)

# Flip orientation of PC2 for Cyanobacteria (does not affect interpretation)
pcoa_dfs$NcBacteria$PC2 <- -pcoa_dfs$NcBacteria$PC2

# Generate relative, lingoes-corrected eigenvalues for PC1 and PC2
eigs <- lapply(pcoas,
  function(x) {
    pcs <- c(PC1 = signif(x$values$Rel_corr_eig[1]*100, 3),
             PC2 = signif(x$values$Rel_corr_eig[2]*100, 3),
             PC3 = signif(x$values$Rel_corr_eig[3]*100, 3)
            )
    return(pcs)
  }
)


pcoa_plots <- Map(
  function(x, n) {
    ggplot(data = x, aes(x = PC1, y = PC2, 
           color = Station, shape = Station)) +
      geom_point(aes(alpha = Month), size = 2.5) + 
      scale_color_manual(values = station_colors) + 
      xlab(paste("PC1 ", eigs[[n]][1], "%")) +
      ylab(paste("PC2 ", eigs[[n]][2], "%")) +
      theme(plot.margin = unit(c(0, 0.2, 0, 0.2), "cm"))
  }, 
  pcoa_dfs, names(pcoa_dfs)
)

    
# Extract legend
pcoa_legend <- grab_legend(pcoa_plots$full)

# Remove legend from plots
pcoa_plots <- lapply(pcoa_plots, 
  function(x) {x + theme(legend.position = "none")}
)
# Function to create a plot of time (x-axis) vs PC scores (y-axis)
plot_pcts <- function(df, pc, eigs) {
  ggplot(df, 
    aes_string(x = "Date", y = pc, group = "Station", color = "Station", shape = "Station")) +
      geom_point(size = 2.5) +
      geom_line(size = 1.1) + 
      scale_color_manual(values = station_colors) +
      scale_x_discrete(
        breaks = c("7/8", "8/4", "9/2", "10/6"),
        labels = c("Jul", "Aug", "Sep", "Oct"),
        drop = FALSE
      ) +  
      ylab(paste(pc, " ", eigs[pc], "%")) +
      theme(
        axis.title.x = element_blank(),
        plot.title = element_text(face = "bold", size = 16),
        legend.position = "none",
        plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm")
      ) 
}

# Non-cyano PC time series plots
non_cyano_pcs <- named_list("PC1", "PC2", "PC3")

non_cyano_pc_plots <- lapply(non_cyano_pcs,
  function(x) {
    plot_pcts(pcoa_dfs$NcBacteria, x, eigs = eigs$NcBacteria)
  }
)
# Plot for pH
ph_plot <- ggplot(pcoa_dfs$NcBacteria, aes(x = Date, y = pH, group = Station, shape = Station, color = Station)) +
  geom_line()

# Plot for Temperature
temp_plot <- ggplot(pcoa_dfs$NcBacteria, aes(x = Date, y = Temp, group = Station, shape = Station, color = Station)) +
  geom_line()

# Plot for pH
tp_plot <- ggplot(pcoa_dfs$NcBacteria, aes(x = Date, y = TP, group = Station, shape = Station, color = Station)) +
  geom_line()

plot(pcoa_dfs$NcBacteria$PC1, log(pcoa_dfs$NcBacteria$TP))

plot(pcoa_dfs$NcBacteria$PC1, pcoa_dfs$NcBacteria$pH)

plot(pcoa_dfs$NcBacteria$PC2, pcoa_dfs$NcBacteria$Temp)

cor.test(pcoa_dfs$NcBacteria$PC1, log(pcoa_dfs$NcBacteria$TP), method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pcoa_dfs$NcBacteria$PC1 and log(pcoa_dfs$NcBacteria$TP)
## t = 6.784, df = 51, p-value = 1.194e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5141512 0.8085122
## sample estimates:
##       cor 
## 0.6887308
cor.test(pcoa_dfs$NcBacteria$PC1, pcoa_dfs$NcBacteria$pH, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pcoa_dfs$NcBacteria$PC1 and pcoa_dfs$NcBacteria$pH
## t = 5.6315, df = 48, p-value = 9.118e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4274487 0.7733266
## sample estimates:
##       cor 
## 0.6307502

Compiled plot

ggdraw() +
  ## non-cyano
  draw_plot(non_cyano_pc_plots$PC1, x = 0.02, y = 0.5, width = 0.3, height = 0.42) +
  draw_plot(non_cyano_pc_plots$PC2, x = 0.34, y = 0.5, width = 0.3, height = 0.42) +
  draw_plot(non_cyano_pc_plots$PC3, x = 0.66, y = 0.5, width = 0.3, height = 0.42) 

how good is my PCoA in two dimensions?

pcoa_dist <- as.vector(dist(pcoas$NcBacteria$vectors[,1:2]))
bray_dist <- as.vector(phyloseq::distance(non_cyanos, method = "bray"))

 plot(bray_dist, pcoa_dist) +
   abline(lm(pcoa_dist~bray_dist), col = "blue")

## numeric(0)
 summary(lm(pcoa_dist~bray_dist))
## 
## Call:
## lm(formula = pcoa_dist ~ bray_dist)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.280556 -0.034510  0.008791  0.045607  0.133647 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.284515   0.006871  -41.41   <2e-16 ***
## bray_dist    1.249986   0.013976   89.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06363 on 1376 degrees of freedom
## Multiple R-squared:  0.8532, Adjusted R-squared:  0.8531 
## F-statistic:  7999 on 1 and 1376 DF,  p-value: < 2.2e-16

ordisurf

ordi <- ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$pH)

ordi_grid <- ordi$grid
ordi_ph <- expand.grid(x = ordi_grid$x, y = ordi_grid$y) #get x and ys
ordi_ph$z <- as.vector(ordi_grid$z) #unravel the matrix for the z scores
ordi_ph <- data.frame(na.omit(ordi_ph)) #gets rid of the nas

ggplot(pcoa_dfs$NcBacteria, aes(x = PC1, y = PC2)) +
  geom_point() +
  stat_contour(data = ordi_ph, aes(x = x, y = y, z = z, binwidth = 4))

http://www.fromthebottomoftheheap.net/2011/06/10/what-is-ordisurf-doing/

n <- pcoas$NcB
ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$pH, main = "pH", method = "ML", select = TRUE)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## y ~ s(x1, x2, k = 10, bs = "tp", fx = FALSE)
## 
## Estimated degrees of freedom:
## 6.54  total = 7.54 
## 
## ML score: -7.987996
ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$Temp, main = "Temp")

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## y ~ s(x1, x2, k = 10, bs = "tp", fx = FALSE)
## 
## Estimated degrees of freedom:
## 7.14  total = 8.14 
## 
## REML score: 102.094
ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$TP, main = "TP")

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## y ~ s(x1, x2, k = 10, bs = "tp", fx = FALSE)
## 
## Estimated degrees of freedom:
## 3.64  total = 4.64 
## 
## REML score: 232.063
ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$Chla, main = "Chla")

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## y ~ s(x1, x2, k = 10, bs = "tp", fx = FALSE)
## 
## Estimated degrees of freedom:
## 4.92  total = 5.92 
## 
## REML score: 202.2073
ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$Turbidity, main = "Turbidity", method = "ML", select = TRUE)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## y ~ s(x1, x2, k = 10, bs = "tp", fx = FALSE)
## 
## Estimated degrees of freedom:
## 1.92  total = 2.92 
## 
## ML score: 170.5032

Boostrapped models

library(leaps)

# Function to extract the best subset multiple linear regression model 
#
# Args:
#   vars: vectors of all variables to consider in the model
#   response: response variable of the model
#   dat: dataframe with vars and response
#
# Returns: a list with the variables in the best model, bic, cp, and adjusted r2
get_bestsub_summary <- function(vars, response, dat) {
  formula = reformulate(termlabels = vars, response = response)
  lm_model <- regsubsets(formula, method = "forward", dat)
  bic <- summary(lm_model)$bic
  cp <- summary(lm_model)$cp
  adjr2 <- summary(lm_model)$adjr2
  best_model <- summary(lm_model)$which[which.min(bic), ]
  return(list(model = best_model, bic = bic, cp = cp, adjr2 = adjr2))
}

# Variables to include in cyano models
cyano_vars <- c("Nitrate", "SRP", "Temp", "H2O2", "SpCond", "Ammonia", "Turbidity", "Days", "TP")

# Variables to include in nc-bacteria models
non_cyano_vars <- c(cyano_vars, "pH",  "ParMC", "Chla")

# Impute SpCond values for nearshore 1 on Sep 2 and Sep 8 with value for nearshore 2
pcoa_dfs_impute <- lapply(pcoa_dfs, 
  function(x) {
    # Change 9/2 value
    x$SpCond[x$Date == "9/2" & x$Station == "nearshore1"] <- 
      x$SpCond[x$Date == "9/2" & x$Station == "nearshore2"]
    # Change 9/8 value
     x$SpCond[x$Date == "9/8" & x$Station == "nearshore1"] <- 
      x$SpCond[x$Date == "9/8" & x$Station == "nearshore2"]
    return(x)
  }
)

Non-cyano models

# get the variables best subset model for the non-cyano community and then 
# fit the model to extract coefficients and p-values
non_cyano_models <- lapply(non_cyano_pcs, 
  function(x) {
    best_model <- get_bestsub_summary(non_cyano_vars, x, dat = pcoa_dfs_impute$NcBacteria)
    model <- lm(
      formula = reformulate(non_cyano_vars[best_model$model[-1]], x), 
      data = pcoa_dfs_impute$NcBacteria
    )
    return(model)
  }
)
set.seed(1)

boot_models <- list()

for (i in 1:100) {
  boot_sample <- sample(pcoa_dfs$NcBacteria$SampleID, replace = TRUE)
  newdf <- pcoa_dfs_impute$NcBacteria[boot_sample, ]
  best_model <- get_bestsub_summary(non_cyano_vars, "PC1", dat = newdf)
  model <- lm(
    formula = reformulate(non_cyano_vars[best_model$model[-1]], "PC1"), 
    data = newdf
  )
  boot_models[[i]] <- model
}

How many models had pH in them, and how frequently was it the most significant term?

n = 15000
thresh = 0.0001

# Calculate mean relative abundance for each OTU
tax_mean <- taxa_sums(non_cyanos)/nsamples(non_cyanos)

# Prune low abundance taxa using thresh as mean relative abundance
nc_prune <- 
  non_cyanos %>%
    prune_taxa(tax_mean > thresh*n, .)

Species <- tax_table(nc_prune)[,"Species"]

cors <- list()

for (species in Species) {
  otu_abun <- as.numeric(otu_table(nc_prune)[species, ])
  cors[[species]] <- cor.test(otu_abun, pcoa_dfs$NcBacteria$PC1, method = "pearson")
}

p_values <- lapply(cors, function(x) {x$p.value})

w <- which(unlist(p_values) < (0.01/385))

r_values <- lapply(cors, function(x) {
  y <- x$estimate
  names(y) <- NULL
  return(y)
})

pos <- which(unlist(r_values) > 0)
neg <- which(unlist(r_values) < 0)

pc1_pos_taxa <- intersect(names(w), names(pos))

pc1_neg_taxa <- intersect(names(w), names(neg))

pc1_postax_table <- tax_table(erie)[pc1_pos_taxa,]
pc1_negtax_table <- tax_table(erie)[pc1_neg_taxa,]

p1 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00002",]), xlab = "PC1 score", ylab = "Otu00002 LHab A1", main = paste("r:", round(cors$Otu00002$estimate,2)))

p2 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00004",]),xlab = "PC1 score", ylab = "Otu00004 acI-B", main = paste("r:", round(cors$Otu00004$estimate,2)))

p3 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00006",]),xlab = "PC1 score", ylab = "Otu00006 acI-C", main = paste("r:", round(cors$Otu00006$estimate,2)))

p4 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00011",]),xlab = "PC1 score", ylab = "Otu00011 acI-A", main = paste("r:", round(cors$Otu00011$estimate,2)))

p5 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00075",]),
xlab = "PC1 score", ylab = "Otu00075 alfII-A", main = paste("r:", round(cors$Otu00075$estimate,2)))

p6 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00073",]),xlab = "PC1 score", ylab = "Otu00073 Planctomyces", main = paste("r:", round(cors$Otu00073$estimate,2)))

par(mfrow = c(2,3))